{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch16_permutation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Modern statistics: Intuition, Math, Python, R\n",
    "## Mike X Cohen (sincxpress.com)\n",
    "### https://www.amazon.com/dp/B0CQRGWGLY\n",
    "#### Code for chapter 16"
   ],
   "metadata": {
    "id": "mzfXc9E3Xq7k"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "import numpy as np\n",
    "import scipy.stats as stats\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "# define global figure properties used for publication\n",
    "import matplotlib_inline.backend_inline\n",
    "matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format\n",
    "plt.rcParams.update({'font.size':14,             # font size\n",
    "                     'savefig.dpi':300,          # output resolution\n",
    "                     'axes.titlelocation':'left',# title location\n",
    "                     'axes.spines.right':False,  # remove axis bounding box\n",
    "                     'axes.spines.top':False,    # remove axis bounding box\n",
    "                     })"
   ],
   "metadata": {
    "id": "3i9Jx5U-rsrx"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "rNPyhCFurso6"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 16.1: Analytic vs empirical H0 distribution"
   ],
   "metadata": {
    "id": "Na1QW13Jz-9T"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# x-axis grid\n",
    "x = np.linspace(-4,4,1001)\n",
    "\n",
    "# compute and normalize the analytic pdf\n",
    "analytical = stats.norm.pdf(x)\n",
    "analytical /= np.max(analytical)\n",
    "\n",
    "# same for empirical\n",
    "empirical = np.random.normal(loc=0,scale=1,size=len(x))\n",
    "yy,xx = np.histogram(empirical,bins='fd')\n",
    "yy = yy/np.max(yy)\n",
    "xx = (xx[1:]+xx[:-1])/2\n",
    "\n",
    "\n",
    "## draw the figure\n",
    "plt.figure(figsize=(6,3))\n",
    "plt.bar(xx,yy,width=.27,color=(.7,.7,.7),edgecolor=(.2,.2,.2),label='Empirical')\n",
    "plt.plot(x,analytical,'k',linewidth=3,label='Analytical')\n",
    "\n",
    "plt.xlabel('value')\n",
    "plt.yticks([])\n",
    "plt.ylabel('Density (a.u.)')\n",
    "plt.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_empVanalyH0.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "Im8zQaSn0Brz"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "AogewHNYz-29"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 16.3/4: Example in comparing two sample means"
   ],
   "metadata": {
    "id": "Hdg0zLv1vtaG"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# number of 'trials' in each condition\n",
    "n1 = 50\n",
    "n2 = 70  # note the trial inbalance!\n",
    "\n",
    "# create data\n",
    "data1 = np.random.randn(n1,1)\n",
    "data2 = np.random.randn(n2,1) + .3  # note the mean offset! This is set to .1 for Figure 16.4\n",
    "\n",
    "# pool the data into one variable (convenient for shuffling)\n",
    "alldata = np.concatenate((data1, data2))\n",
    "\n",
    "# corresponding labels\n",
    "truelabels = np.concatenate((np.ones(n1), 2*np.ones(n2)))\n",
    "\n",
    "# compute the observed condition difference\n",
    "true_conddif = np.mean(alldata[truelabels==1]) - np.mean(alldata[truelabels==2])\n",
    "\n",
    "\n",
    "### creating a null-hypothesis (H0) distribution\n",
    "\n",
    "# number of iterations for permutation testing\n",
    "nIterations = 1000\n",
    "\n",
    "# initialize output variable\n",
    "permvals = np.zeros(nIterations)\n",
    "\n",
    "for permi in range(nIterations):\n",
    "  # random permutation to swap the labels\n",
    "  shuflabels = np.random.permutation(truelabels)\n",
    "\n",
    "  # mean differences in the shuffled data\n",
    "  permvals[permi] = np.mean(alldata[shuflabels==1]) - np.mean(alldata[shuflabels==2])\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "### visualizations\n",
    "_,axs = plt.subplots(1,3,figsize=(12,4))\n",
    "\n",
    "# show the real data and means\n",
    "axs[0].plot(data1,np.zeros(n1),'ko',markersize=12,markerfacecolor=(.4,.4,.4),alpha=.5)\n",
    "axs[0].plot(data2,np.ones(n2),'ks',markersize=12,markerfacecolor=(.8,.8,.8),alpha=.5)\n",
    "axs[0].plot([np.mean(data1),np.mean(data1)],[.7,1.3],'k--',linewidth=3)\n",
    "axs[0].plot([np.mean(data2),np.mean(data2)],[-.3,.3],'k--',linewidth=3)\n",
    "axs[0].set(ylim=[-1,2],ylabel='Data series',yticks=[0,1],xlabel='Data value')\n",
    "axs[0].set_title(r'$\\bf{A})$  Real data')\n",
    "\n",
    "# show one example shuffled data\n",
    "axs[1].plot(alldata[shuflabels==1],np.zeros(n1),'ko',markersize=12,markerfacecolor=(.4,.4,.4),alpha=.5)\n",
    "axs[1].plot(alldata[shuflabels==2],np.ones(n2),'ks',markersize=12,markerfacecolor=(.8,.8,.8),alpha=.5)\n",
    "axs[1].plot([np.mean(alldata[shuflabels==1]),np.mean(alldata[shuflabels==1])],[.7,1.3],'k--',linewidth=3)\n",
    "axs[1].plot([np.mean(alldata[shuflabels==2]),np.mean(alldata[shuflabels==2])],[-.3,.3],'k--',linewidth=3)\n",
    "axs[1].set(ylim=[-1,2],ylabel='Data series',yticks=[0,1],yticklabels=['\"0\"','\"1\"'],xlabel='Data value')\n",
    "axs[1].set_title(r'$\\bf{B})$  Shuffled data')\n",
    "\n",
    "# distribution of shuffled means\n",
    "axs[2].hist(permvals,bins=40,color=[.7,.7,.7])\n",
    "axs[2].axvline(x=true_conddif,color='k',linestyle='--',linewidth=3)\n",
    "axs[2].legend(['True', 'Shuffled'],loc='lower right')\n",
    "axs[2].set(xlim=[-1,1],xlabel='Mean value',ylabel='Count')\n",
    "axs[2].set_title(r'$\\bf{C})$  Shuffling distribution')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_ttestIllustrated.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "us1V2TyprsmD"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "3A8826pzrsjL"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Convert to p-value"
   ],
   "metadata": {
    "id": "YDHhEZ5xrsgD"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# based on normalized distance\n",
    "\n",
    "zVal = (true_conddif-np.mean(permvals)) / np.std(permvals,ddof=1)\n",
    "p_z  = (1-stats.norm.cdf(np.abs(zVal)))*2 # two-tailed!\n",
    "\n",
    "print(f'Z = {zVal:.3f}, p = {p_z:.3f}')"
   ],
   "metadata": {
    "id": "bfpNud28ZmZU"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# based on counts\n",
    "p_c = np.sum(np.abs(permvals)>np.abs(true_conddif)) / nIterations\n",
    "\n",
    "print(f'p_c = {p_c:.3f}')"
   ],
   "metadata": {
    "id": "NbkuR8YprsdM"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "0cGchZeOoJiV"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 16.5/6: Margin figures about p-values"
   ],
   "metadata": {
    "id": "GojWWd5uoJfP"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# p_z is appropriate for (roughly) Gaussian H0 distributions\n",
    "\n",
    "H0 = np.random.normal(0,2,size=1000)\n",
    "\n",
    "plt.figure(figsize=(4,5))\n",
    "h = plt.hist(H0,bins='fd',color=(.9,.9,.9),edgecolor=(.6,.6,.6))\n",
    "plt.annotate('Observed\\nstatistic',xytext=[3,.95*np.max(h[0])],va='top',ha='center',rotation=90,size=12,\n",
    "             xy=[3,0],arrowprops={'color':'k'})\n",
    "\n",
    "plt.annotate(r'$\\overline{H_0}$',xytext=[np.mean(H0),np.max(h[0])],va='top',ha='center',rotation=0,size=15,weight='bold',\n",
    "             xy=[np.mean(H0),0],arrowprops={'color':'k','arrowstyle':'->','linestyle':'--','linewidth':3})\n",
    "plt.annotate(r'$s_{H_0}$',xytext=[-np.std(H0,ddof=1)*2,np.mean(h[0])],va='center',rotation=0,size=15,weight='bold',\n",
    "             xy=[np.mean(H0),np.mean(h[0])],arrowprops={'color':'k','arrowstyle':'-','linestyle':'--','linewidth':3})\n",
    "\n",
    "plt.text(3.5,np.max(h[0])/2,r'$z = \\frac{obs-\\overline{H_0}}{s_{H_0}}$',size=20)\n",
    "\n",
    "\n",
    "plt.xlabel('Parameter value')\n",
    "plt.ylabel('Count')\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_pz.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "1jArt4JXoMKi"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# p_c is appropriate for any shape H0 distributions\n",
    "\n",
    "H0 = np.random.exponential(2,size=1000)\n",
    "\n",
    "plt.figure(figsize=(4,5))\n",
    "h = plt.hist(H0,bins='fd',color=(.9,.9,.9),edgecolor=(.6,.6,.6))\n",
    "plt.annotate('Observed\\nstatistic',xytext=[5,.95*np.max(h[0])],va='top',ha='center',rotation=90,size=12,\n",
    "             xy=[5,0],arrowprops={'color':'k'})\n",
    "\n",
    "plt.text(6,np.max(h[0])/2,r'$p = \\frac{\\sum H_0>obs}{N_{H_0}}$',size=20)\n",
    "\n",
    "# paint bars black if greater than statistic\n",
    "for p in h[2]:\n",
    "  if p.get_x()>5: p.set_facecolor('k')\n",
    "\n",
    "plt.xlabel('Parameter value')\n",
    "plt.ylabel('Count')\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_pc.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "ZdPqn5gfoJch"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "d-RaA-SBoJZ_"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 16.7: Permutation testing for the mean of one sample"
   ],
   "metadata": {
    "id": "8nFiB6epGbiH"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create non-normal data\n",
    "N = 87\n",
    "data = stats.gamma.rvs(1.2,size=N)\n",
    "h0val = 1\n",
    "sampleMean = np.mean(data)\n",
    "\n",
    "# Note: creating the data in a separate cell lets you\n",
    "# re-run the permutation testing multiple times on the same data."
   ],
   "metadata": {
    "id": "0yx2W_O_GbSl"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# permutation testing\n",
    "\n",
    "data4perm = data - h0val # shift the problem such that H0=0\n",
    "obsMean = np.mean(data4perm)\n",
    "\n",
    "nPerms = 1000\n",
    "permMeans = np.zeros(nPerms)\n",
    "\n",
    "for permi in range(nPerms):\n",
    "\n",
    "  # create a vector of +/- 1's\n",
    "  randSigns = np.sign(np.random.randn(N))\n",
    "\n",
    "  # mean of shuffled data\n",
    "  permMeans[permi] = np.mean( randSigns*np.abs(data4perm) )\n",
    "\n",
    "\n",
    "\n",
    "# compute p-value based on extreme count\n",
    "pval = np.sum(np.abs(permMeans) > np.abs(obsMean)) / nPerms\n",
    "\n",
    "\n",
    "# show distributions\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "axs[0].hist(data,bins='fd',color=(.6,.6,.6),edgecolor='k',label='Data')\n",
    "axs[0].axvline(x=h0val,color='k',linestyle='--',linewidth=3,label=r'H$_0$ value')\n",
    "axs[0].axvline(x=sampleMean,color=(.3,.3,.3),linestyle=':',linewidth=2,label='Observed mean')\n",
    "axs[0].legend()\n",
    "axs[0].set_title(r'$\\bf{A})$  Data distribution')\n",
    "axs[0].set(xlabel='Data value',ylabel='Count')\n",
    "\n",
    "# histogram of permutations (adding back h0 value for visualization)\n",
    "axs[1].hist(permMeans+h0val,bins='fd',color=(.9,.9,.9),edgecolor='k',label='Shuffled means')\n",
    "axs[1].axvline(x=sampleMean,color=(.3,.3,.3),linestyle=':',linewidth=3,label='Observed mean')\n",
    "axs[1].set_title(rf'$\\bf{{B}})$  H$_0$ distribution (p={pval})')\n",
    "axs[1].set(xlabel='Value',ylabel='Count')\n",
    "axs[1].legend(loc='lower left')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_oneSample.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "PCvdQ-sZGbN7"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "PeCVMU9cGbGT"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 16.9: Number of iterations"
   ],
   "metadata": {
    "id": "ctAYjTwwGbC5"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "\n",
    "numberOfIterations = np.arange(10,10011,100)\n",
    "\n",
    "H0chars = np.zeros((len(numberOfIterations),3))\n",
    "\n",
    "\n",
    "for ni,nPerms in enumerate(numberOfIterations):\n",
    "\n",
    "  # permutation testing\n",
    "  permMeans = np.zeros(nPerms)\n",
    "  for permi in range(nPerms):\n",
    "    permMeans[permi] = np.mean( np.sign(np.random.randn(N))*data4perm )\n",
    "\n",
    "  # H0 distribution characteristics\n",
    "  H0chars[ni,0] = np.mean(np.abs(permMeans) > np.abs(obsMean)) # p-value\n",
    "  H0chars[ni,1] = np.mean(permMeans)                              # H0 distribution mean\n",
    "  H0chars[ni,2] = stats.iqr(permMeans)                            # distribution width (IQR)\n",
    "\n",
    "\n",
    "# plotting\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(numberOfIterations,H0chars-np.mean(H0chars,axis=0),\n",
    "         'o',markersize=10,markerfacecolor='w',linewidth=2)\n",
    "plt.legend(['P-value','H0 mean','H0 width'])\n",
    "plt.xlabel('Number of iterations')\n",
    "plt.ylabel('Distribution characteristic value')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_numIters.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "SoQlrsxGGa_r"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "3MXHyUWAI2n1"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 1"
   ],
   "metadata": {
    "id": "hreQGeTdI2kN"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# simulation parameters\n",
    "N = 55\n",
    "nIterations = 1000\n",
    "\n",
    "# the data and its mean\n",
    "theData = np.random.normal(loc=0,scale=1,size=N)**2 - 1\n",
    "theMean = np.mean(theData)\n",
    "\n",
    "# one permutation test\n",
    "permMeans = np.zeros(nIterations)\n",
    "for permi in range(nIterations):\n",
    "\n",
    "  # the data with random sign flips\n",
    "  signFlippedData = np.sign(np.random.randn(N))*theData\n",
    "\n",
    "  # and its mean\n",
    "  permMeans[permi] = np.mean( signFlippedData )\n",
    "\n",
    "\n",
    "# zscore relative to H0 distribution\n",
    "zVal = (theMean-np.mean(permMeans)) / np.std(permMeans,ddof=1)\n",
    "pVal = (1-stats.norm.cdf(np.abs(zVal)))*2\n",
    "\n",
    "# print the z/p values\n",
    "print(f'z = {zVal:.2f}, p = {pVal:.3f}')"
   ],
   "metadata": {
    "id": "s1t7u0Mo4RyM"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# simulation parameters (repeated from previous cell; feel free to modify!)\n",
    "N = 55\n",
    "nPermTests = 750\n",
    "nIterations = 1000\n",
    "\n",
    "theData = np.random.normal(loc=0,scale=1,size=N)**2 - 1\n",
    "theMean = np.mean(theData)\n",
    "\n",
    "# initialize output vector\n",
    "zVals = np.zeros(nPermTests)\n",
    "\n",
    "# loop over all the permutation tests\n",
    "for ni in range(nPermTests):\n",
    "\n",
    "  # permutation testing (same as above but with fewer lines of code)\n",
    "  permMeans = np.zeros(nIterations)\n",
    "  for permi in range(nIterations):\n",
    "    permMeans[permi] = np.mean( np.sign(np.random.randn(N))*theData )\n",
    "\n",
    "  # zscore relative to H0 distribution\n",
    "  zVals[ni] = (theMean-np.mean(permMeans)) / np.std(permMeans,ddof=1)\n",
    "\n",
    "\n",
    "## plotting\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "axs[0].hist(theData,bins='fd',color=(.9,.9,.9),edgecolor='k',label='Data')\n",
    "axs[0].axvline(x=0,color='k',linestyle='--',linewidth=3,label=r'H$_0$ value')\n",
    "axs[0].axvline(x=theMean,color=(.3,.3,.3),linestyle=':',linewidth=2,label='Observed mean')\n",
    "axs[0].set(xlabel='Data values',ylabel='Counts')\n",
    "axs[0].legend()\n",
    "axs[0].set_title(r'$\\bf{A}$)  Data histogram')\n",
    "\n",
    "axs[1].hist(zVals,bins='fd',color=(.5,.5,.5),edgecolor=(.2,.2,.2))\n",
    "axs[1].set(xlabel='Z values',ylabel='Counts')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Perm-z histogram')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_ex1.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "i0BYwJNOI5BG"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "fRybXAsbI2gz"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 2"
   ],
   "metadata": {
    "id": "4CVS9NmjJr2j"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create non-normal data\n",
    "N = 100\n",
    "data = np.random.uniform(low=-1,high=1,size=N)\n",
    "data -= np.mean(data)\n",
    "h0val = -.11\n",
    "\n",
    "# other simulation parameters\n",
    "nPerms = 1000\n",
    "numPermTests = 1000\n",
    "\n",
    "# permutation testing\n",
    "data4perm = data - h0val\n",
    "obsMean = np.mean(data4perm)\n",
    "\n",
    "\n",
    "# initialize output variables\n",
    "permMeans = np.zeros(nPerms)\n",
    "pvals = np.zeros(numPermTests)\n",
    "\n",
    "\n",
    "# the 'outer loop' over many permutation tests\n",
    "for permRepeati in range(numPermTests):\n",
    "\n",
    "  # permutation test (copied from previous code in this notebook)\n",
    "  for permi in range(nPerms):\n",
    "    randSigns = np.sign(np.random.randn(N))\n",
    "    permMeans[permi] = np.mean( randSigns*data4perm )\n",
    "\n",
    "  # compute and store the p-value\n",
    "  pvals[permRepeati] = np.mean( np.abs(permMeans) > np.abs(obsMean) )\n",
    "\n",
    "\n",
    "\n",
    "## plotting\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "axs[0].plot(data,'ko',markerfacecolor=(.9,.9,.9),markersize=10,label='Data')\n",
    "axs[0].axhline(y=h0val,color='k',linestyle='--',linewidth=3,label=r'H$_0$ value')\n",
    "axs[0].axhline(y=0,color=(.3,.3,.3),linestyle=':',linewidth=2,label='Observed mean')\n",
    "axs[0].set(xlabel='Data index',ylabel='Data value')\n",
    "axs[0].legend(loc='upper right')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Data and prediction')\n",
    "\n",
    "h = axs[1].hist(pvals,bins='fd',color=(.7,.7,.7),edgecolor=(.2,.2,.2))\n",
    "axs[1].set(xlabel='P-values',ylabel='Counts')\n",
    "axs[1].set_title(r'$\\bf{B}$)  P$_c$ histogram ' + f'({np.mean(pvals<.05)*100:.1f}% \"sig.\")')\n",
    "axs[1].axvline(x=.05,color='k',linestyle='--')\n",
    "\n",
    "# paint bars black if \"significant\"\n",
    "for p in h[2]:\n",
    "  if p.get_x()<.05: p.set_facecolor('k')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_ex2.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "hISY5uSMLBCW"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "dCDzRfSdm8-Y"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 3"
   ],
   "metadata": {
    "id": "i6uPyzeNqVJe"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# data parameters\n",
    "n1,n2 = 50,70\n",
    "\n",
    "# experiment parameters\n",
    "nIterations = 1000 # in each permutation test\n",
    "numRepeats = 541 # number of times to generate new data\n",
    "\n",
    "# initializations\n",
    "permvals = np.zeros(nIterations)\n",
    "truelabels = np.concatenate((np.ones(n1), 2*np.ones(n2)))\n",
    "pvals = np.zeros((numRepeats,2))\n",
    "\n",
    "\n",
    "\n",
    "# run the experiment!\n",
    "for expi in range(numRepeats):\n",
    "\n",
    "  # create new data\n",
    "  data1 = np.random.randn(n1,1)\n",
    "  data2 = np.random.randn(n2,1) + .3  # note the mean offset!\n",
    "\n",
    "  # pool the data into one variable (convenient for shuffling)\n",
    "  alldata = np.concatenate((data1, data2))\n",
    "  true_conddif = np.mean(alldata[truelabels==1]) - np.mean(alldata[truelabels==2])\n",
    "\n",
    "  ### creating a null-hypothesis (H0) distribution\n",
    "  for permi in range(nIterations):\n",
    "    shuflabels = np.random.permutation(truelabels)\n",
    "    permvals[permi] = np.mean(alldata[shuflabels==1]) - np.mean(alldata[shuflabels==2])\n",
    "\n",
    "  # p_z\n",
    "  zVal = (true_conddif-np.mean(permvals)) / np.std(permvals,ddof=1)\n",
    "  pvals[expi,0] = (1-stats.norm.cdf(np.abs(zVal)))*2 # two-tailed!\n",
    "\n",
    "  # p_c\n",
    "  pvals[expi,1] = np.sum(np.abs(permvals)>np.abs(true_conddif)) / nIterations\n",
    "\n",
    "\n",
    "\n",
    "## and the visualization\n",
    "plt.figure(figsize=(5,4))\n",
    "plt.plot(pvals[:,0],pvals[:,1],'ko',markersize=10,markerfacecolor=(.8,.8,.8),alpha=.5)\n",
    "plt.plot([0,1],[0,1],'k')\n",
    "plt.xlabel('p-values from z-score')\n",
    "plt.ylabel('p-values from counting')\n",
    "plt.title(f'r = {np.corrcoef(pvals.T)[0,1]:.3f}',loc='center')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_ex3.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "f8pPWeHxnpp2"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Zxskz5CsJrsw"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 4"
   ],
   "metadata": {
    "id": "kekDi-ROJrpz"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "nPerms = 1000\n",
    "permMeans = np.zeros(nPerms)\n",
    "\n",
    "N = 30\n",
    "h0val = .5\n",
    "pvals = np.zeros((100,2))\n",
    "\n",
    "\n",
    "# loop over the experiment iterations\n",
    "for iter in range(100):\n",
    "\n",
    "  # create the data (shifted by h0 such that H0=0)\n",
    "  X = np.random.randn(N)**1 - h0val\n",
    "\n",
    "  # permutation testing\n",
    "  permMeans = np.zeros(nPerms)\n",
    "  for permi in range(nPerms):\n",
    "    permMeans[permi] = np.mean( np.random.choice((-1,1),N)*X )\n",
    "\n",
    "\n",
    "  # p-value from permutation testing\n",
    "  pvals[iter,0] = np.mean( np.abs(permMeans)>np.abs(np.mean(X)) )\n",
    "\n",
    "  # p-value from 1-sample ttest\n",
    "  pvals[iter,1] = stats.ttest_1samp(X,0).pvalue\n",
    "\n",
    "\n",
    "# replace p=0 with p=min\n",
    "pvals[pvals==0] = np.min(pvals[pvals>0])\n",
    "pvals = np.log(pvals)\n",
    "\n",
    "## visualization\n",
    "plt.figure(figsize=(8,5))\n",
    "plt.plot(pvals[:,0],pvals[:,1],'ko',\n",
    "         markersize=10,markerfacecolor=(.1,.1,.1),alpha=.5)\n",
    "prange = [ np.min(pvals),np.max(pvals) ] # min/max p-values for the unity line\n",
    "plt.plot([prange[0],prange[1]],[prange[0],prange[1]],'k')\n",
    "plt.xlabel('Permutation log(p)')\n",
    "plt.ylabel('Parametric log(p)')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_ex4.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "w2leMOG8Jrm3"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "AbcJB6AEvdTw"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 5"
   ],
   "metadata": {
    "id": "rkskuMc7vdQu"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# simulation parameters\n",
    "N = 30\n",
    "nPerms = 1000\n",
    "stdevs = np.linspace(.1,2,20)\n",
    "\n",
    "# initializations\n",
    "permMeans = np.zeros(nPerms)\n",
    "results = np.zeros((len(stdevs),3))\n",
    "H0dists = [0]*2\n",
    "\n",
    "\n",
    "## run the experiment\n",
    "for si,s in enumerate(stdevs):\n",
    "\n",
    "  # create the data\n",
    "  data = np.random.normal(.2,s,size=N)\n",
    "\n",
    "  # permutation testing\n",
    "  for permi in range(nPerms):\n",
    "    permMeans[permi] = np.mean( np.random.choice((-1,1),N)*data )\n",
    "\n",
    "  # store the t/z values\n",
    "  results[si,0] = (np.mean(data)-np.mean(permMeans)) / np.std(permMeans,ddof=1)\n",
    "  results[si,1] = stats.iqr(permMeans)\n",
    "  results[si,2] = stats.ttest_1samp(data,0).statistic\n",
    "\n",
    "  # store the extremiest H0 distributions\n",
    "  if si==0:\n",
    "    H0dists[0] = permMeans+0 # +0 makes a copy\n",
    "  elif si==(len(stdevs)-1):\n",
    "    H0dists[1] = permMeans\n",
    "\n",
    "\n",
    "## plotting!\n",
    "_,axs = plt.subplots(2,2,figsize=(10,8))\n",
    "\n",
    "axs[0,0].plot(stdevs,results[:,0],'ks-',linewidth=2,label='Permutation z')\n",
    "axs[0,0].plot(stdevs,results[:,2],'--o',color=(.6,.6,.6),linewidth=2,label='Parametric t')\n",
    "axs[0,0].legend()\n",
    "axs[0,0].set(xlabel=r'Population $\\sigma$',ylabel='t or z')\n",
    "axs[0,0].set_title(r'$\\bf{A}$)  t and z values')\n",
    "\n",
    "axs[0,1].plot(stdevs,results[:,1],'ko',markerfacecolor=(.8,.8,.8),markersize=12)\n",
    "axs[0,1].set(xlabel=r'Population $\\sigma$',ylabel=r'H$_0$ IQR')\n",
    "axs[0,1].set_title(r'$\\bf{B}$)  H$_0$ distribution width tracks $\\sigma$')\n",
    "\n",
    "# histograms of the H0 distributions\n",
    "axs[1,0].hist(H0dists[0],bins='fd',color=(.8,.8,.8),edgecolor='k')\n",
    "axs[1,0].set_title(r'$\\bf{C}$)  H$_0$ distribution when $\\sigma$=' + str(stdevs[0]))\n",
    "axs[1,1].hist(H0dists[1],bins='fd',color=(.8,.8,.8),edgecolor='k')\n",
    "axs[1,1].set_title(r'$\\bf{D}$)  H$_0$ distribution when $\\sigma$=' + str(stdevs[-1]))\n",
    "\n",
    "for a in axs[1,:]:\n",
    "  a.set(xlabel='Shuffled means',ylabel='Count',xlim=np.array([-1.1,1.1])*np.max(np.abs(H0dists[1])))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_ex5.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "0qJneUSivdNt"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "lo5A-laaQVpS"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 6"
   ],
   "metadata": {
    "id": "b2fO6gvxQVsX"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# simulation parameters\n",
    "n = 50\n",
    "r = .2\n",
    "x = np.random.randn(n)\n",
    "y = np.random.randn(n)\n",
    "y = x*r + y*np.sqrt(1-r**2)\n",
    "\n",
    "# mean-center\n",
    "x -= np.mean(x)\n",
    "y -= np.mean(y)\n",
    "\n",
    "# observed correlation and dot product\n",
    "r,p = stats.pearsonr(x,y)\n",
    "dp = np.dot(x,y)\n",
    "\n",
    "\n",
    "# initialize output matrix\n",
    "permRes = np.zeros((1000,2))\n",
    "\n",
    "# permutation testing\n",
    "for permi in range(len(permRes)):\n",
    "\n",
    "  # shuffle y\n",
    "  yshuf = y[np.random.permutation(n)]\n",
    "\n",
    "  # pearson correlation\n",
    "  permRes[permi,0] = stats.pearsonr(x,yshuf)[0]\n",
    "\n",
    "  # mean-centered dot product\n",
    "  permRes[permi,1] = np.dot(x,yshuf)\n",
    "\n",
    "\n",
    "# z and p values\n",
    "z_r = ( r-np.mean(permRes[:,0])) / np.std(permRes[:,0],ddof=1)\n",
    "z_d = (dp-np.mean(permRes[:,1])) / np.std(permRes[:,1],ddof=1)\n",
    "\n",
    "p_r = (1-stats.norm.cdf(np.abs(z_r)))*2\n",
    "p_d = (1-stats.norm.cdf(np.abs(z_d)))*2\n",
    "\n",
    "\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "axs[0].hist(permRes[:,0],bins=40,color=(.8,.8,.8),edgecolor='k')\n",
    "axs[0].axvline(x=r,linestyle='--',color='k',linewidth=3)\n",
    "axs[0].set(xlabel='Correlation coefficient',ylabel='Count')\n",
    "axs[0].set_title(r'$\\bf{A}$)' + ' Correlation coeff. shuffles' + '\\n' + f'     z={z_r:.3f}, p={p_r:.3f}')\n",
    "\n",
    "axs[1].hist(permRes[:,1],bins=40,color=(.8,.8,.8),edgecolor='k')\n",
    "axs[1].axvline(x=dp,linestyle='--',color='k',linewidth=3)\n",
    "axs[1].set(xlabel='Dot product',ylabel='Count')\n",
    "axs[1].set_title(r'$\\bf{B}$)' + ' Dot product shuffles' + '\\n' + f'     z={z_d:.3f}, p={p_d:.3f}')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_ex6.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "Z37CYrPzQVvX"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "d_Lmy8_kQVxy"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 7\n"
   ],
   "metadata": {
    "id": "liKfVTmCI2dK"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "anscombe = np.array([\n",
    "     # series 1     series 2      series 3       series 4\n",
    "    [10,  8.04,    10,  9.14,    10,  7.46,      8,  6.58, ],\n",
    "    [ 8,  6.95,     8,  8.14,     8,  6.77,      8,  5.76, ],\n",
    "    [13,  7.58,    13,  8.76,    13, 12.74,      8,  7.71, ],\n",
    "    [ 9,  8.81,     9,  8.77,     9,  7.11,      8,  8.84, ],\n",
    "    [11,  8.33,    11,  9.26,    11,  7.81,      8,  8.47, ],\n",
    "    [14,  9.96,    14,  8.10,    14,  8.84,      8,  7.04, ],\n",
    "    [ 6,  7.24,     6,  6.13,     6,  6.08,      8,  5.25, ],\n",
    "    [ 4,  4.26,     4,  3.10,     4,  5.39,      8,  5.56, ],\n",
    "    [12, 10.84,    12,  9.13,    12,  8.15,      8,  7.91, ],\n",
    "    [ 7,  4.82,     7,  7.26,     7,  6.42,      8,  6.89, ],\n",
    "    [ 5,  5.68,     5,  4.74,     5,  5.73,     19, 12.50, ]\n",
    "    ])\n",
    "\n",
    "nSamples = anscombe.shape[0]\n",
    "permRs = np.zeros(1000)\n",
    "\n",
    "\n",
    "# plot data and correlations\n",
    "fig,ax = plt.subplots(2,2,figsize=(8,6))\n",
    "ax = ax.ravel()\n",
    "\n",
    "for i in range(4):\n",
    "\n",
    "  # convenience\n",
    "  x = anscombe[:,i*2]\n",
    "  y = anscombe[:,i*2+1]\n",
    "\n",
    "  # plot the points\n",
    "  ax[i].plot(x,y,'ko',markersize=10,markerfacecolor=(.7,.7,.7))\n",
    "\n",
    "  # compute the correlation and parametric p-value\n",
    "  r,pp = stats.pearsonr(x,y)\n",
    "\n",
    "  # permutation testing\n",
    "  for permi in range(len(permRs)):\n",
    "    permRs[permi] = stats.pearsonr(x,y[np.random.permutation(nSamples)])[0]\n",
    "  pc = np.mean(np.abs(permRs)>=np.abs(r))\n",
    "\n",
    "\n",
    "  # update the axis\n",
    "  ax[i].set(xticks=[],yticks=[])\n",
    "  ax[i].set_title(f'$r={r:.2f}, p={pp:.3f}$ \\n $p_c={pc:.3f}$',loc='center')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('permute_ex7.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "D_qM4ZmPQVmO"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# btw, interesting to see that the possible permuted correlation values is limited\n",
    "# due to the small sample size and limited data values... not an ideal situation for\n",
    "# parametric or non-parametric analyses.\n",
    "plt.hist(permRs,bins=40,color=(.8,.8,.8),edgecolor='k');\n",
    "\n",
    "print(f'{len(permRs)} random permutations and only {len(np.unique(permRs))} unique values!')"
   ],
   "metadata": {
    "id": "OYGE6PbTI2Zi"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "S6bdYFe1vU_B"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}